/**
 * Author: WuLC
 * Date:   2016-05-25 09:18:09
 * Last modified by:   WuLC
 * Last Modified time: 2016-05-25 14:50:52
 * Email: liangchaowu5@gmail.com
 * *****************************************************
 * Function: combine TextRank and TF-IDF to extract keywords
 * Input: path of the directory of the corpus
 * Output: keywords extracted for each document
 */

package com.ajaxjs.nlp;

import com.ajaxjs.nlp.parsedoc.ReadDir;
import com.ajaxjs.nlp.parsedoc.ReadFile;

import java.util.*;


/**
 * Keyword extraction that combines TextRank with TF-IDF.
 *
 * <p>Two strategies are offered:
 * <ul>
 *   <li>{@link #textRankMultiplyIDF(String)} weights each TextRank score by the word's IDF value;</li>
 *   <li>{@link #textRankTFIDFVote(String)} keeps the words both algorithms agree on and fills the
 *       remainder from the TF-IDF ranking.</li>
 * </ul>
 *
 * <p>The number of keywords is held in static fields and configured via
 * {@link #setKeywordsNumber(int)}.
 */
public class TextRankWithTFIDF {
    /** Maximum number of keywords returned per document. */
    private static int keywordsNumber = 5;

    /** Number of TF-IDF candidates requested before voting (twice {@link #keywordsNumber}). */
    private static int keywordCandidateNum = 10;

    /**
     * Sets the number of keywords to extract for each document. The size of the TF-IDF
     * candidate pool used by {@link #textRankTFIDFVote(String)} is set to twice this number.
     *
     * @param number number of keywords to extract
     */
    public static void setKeywordsNumber(int number) {
        keywordsNumber = number;
        keywordCandidateNum = 2 * number;
    }

    /**
     * Multiplies the TextRank score of every word by the IDF value of that word in the corpus
     * and returns the highest-scoring words of each document.
     *
     * <p>Words that have no IDF value are skipped because they cannot be weighted. A document
     * with fewer scored words than the configured keyword number yields a shorter list instead
     * of failing.
     *
     * @param dirPath path of the directory of the corpus
     * @return for each file name, its keywords in descending order of combined score
     */
    public static Map<String, List<String>> textRankMultiplyIDF(String dirPath) {
        Map<String, List<String>> result = new HashMap<>();

        // get the IDF values for the words of a corpus
        Map<String, Float> idfForDir = TFIDF.idfForDir(dirPath);
        List<String> fileList = ReadDir.readDirFileNames(dirPath);

        for (String file : fileList) {
            String content = ReadFile.loadFile(file);
            Map<String, Float> trKeywords = TextRank.getWordScore("", content);

            // Build the weighted scores in a separate list so the map is never modified
            // while it is being iterated.
            List<Map.Entry<String, Float>> weighted = new ArrayList<>(trKeywords.size());
            for (Map.Entry<String, Float> entry : trKeywords.entrySet()) {
                Float score = entry.getValue();
                Float idf = idfForDir.get(entry.getKey());
                if (score == null || idf == null) {
                    continue; // no IDF (or no score) available: the word cannot be weighted
                }
                weighted.add(new AbstractMap.SimpleEntry<>(entry.getKey(), score * idf));
            }

            // sort the words in terms of their score in descending order
            Collections.sort(weighted, new Comparator<Map.Entry<String, Float>>() {
                @Override
                public int compare(Map.Entry<String, Float> c1, Map.Entry<String, Float> c2) {
                    return c2.getValue().compareTo(c1.getValue());
                }
            });

            // Never read past the end of the list when the document is short.
            int limit = Math.max(0, Math.min(keywordsNumber, weighted.size()));
            List<String> keywords = new ArrayList<>(limit);
            for (int i = 0; i < limit; i++) {
                keywords.add(weighted.get(i).getKey());
            }

            result.put(file, keywords);
        }
        return result;
    }

    /**
     * Integrates the results generated by TextRank and TF-IDF: words that co-occur in both
     * results are chosen first (in TF-IDF order); if there are not enough of them, the rest is
     * taken from the remaining TF-IDF candidates.
     *
     * <p>This method calls {@code TFIDF.setKeywordsNumber} with the candidate pool size before
     * requesting the TF-IDF keywords. Every file gets an entry in the result; the list holds at
     * most the configured number of keywords and is shorter when fewer candidates exist.
     *
     * @param dirPath path of the directory of the corpus
     * @return for each file name, the selected keywords
     */
    public static Map<String, List<String>> textRankTFIDFVote(String dirPath) {
        Map<String, List<String>> result = new HashMap<>();
        List<String> fileList = ReadDir.readDirFileNames(dirPath);

        // get keywords generated by TF-IDF
        TFIDF.setKeywordsNumber(keywordCandidateNum);
        Map<String, List<String>> tfidfKeywordsForDir = TFIDF.getKeywords(dirPath);

        for (String file : fileList) {
            String content = ReadFile.loadFile(file);
            List<String> trKeyword = TextRank.getKeyword("", content);
            List<String> tfidfKeyword = tfidfKeywordsForDir.get(file);

            result.put(file, vote(trKeyword, tfidfKeyword, keywordsNumber));
        }

        return result;
    }

    /**
     * Selects up to {@code limit} keywords from the TF-IDF candidates, preferring those that
     * also appear in the TextRank result.
     *
     * @param trKeyword    keywords produced by TextRank, may be {@code null}
     * @param tfidfKeyword candidates produced by TF-IDF in ranking order, may be {@code null}
     * @param limit        maximum number of keywords to return
     * @return the selected keywords without duplicates, never {@code null}
     */
    private static List<String> vote(List<String> trKeyword, List<String> tfidfKeyword, int limit) {
        if (tfidfKeyword == null || limit <= 0) {
            return new ArrayList<>();
        }

        Set<String> textRankWords = (trKeyword == null)
                ? Collections.<String>emptySet()
                : new HashSet<>(trKeyword);

        // LinkedHashSet keeps selection order and makes the duplicate check O(1).
        Set<String> chosen = new LinkedHashSet<>();

        // First pass: words both algorithms agree on.
        for (String keyword : tfidfKeyword) {
            if (chosen.size() >= limit) {
                break;
            }
            if (textRankWords.contains(keyword)) {
                chosen.add(keyword);
            }
        }

        // Second pass: fill the remaining slots from the TF-IDF ranking, stopping at the limit.
        for (String keyword : tfidfKeyword) {
            if (chosen.size() >= limit) {
                break;
            }
            chosen.add(keyword);
        }

        return new ArrayList<>(chosen);
    }

}
